******************************************************************************************************************************************************
*This do-file codes up the PCI data for analysis	
*	a.	Input files: 
*		i.	PCI_repeated_cross_CLEAN.dta
*		ii.	isic_alphabet_crosswalk.dta
*	b.	Output files:
*		i.	PCI_coded.dta													   																	    *
******************************************************************************************************************************************************

* Session setup: suppress -more- pauses, empty memory, then size memory and
* the variable limit before any data are loaded.
set more off
clear all
set mem 600m
set maxvar 3000


* SET DIRECTORY HERE
* Root folders used by the file paths further down in this do-file.
global dir_data_original "/Users/Jie/Dropbox (Personal)/CorruptionIncome/EJFinalSubmission/data&program/original raw data/"
global dir_data_coded "/Users/Jie/Dropbox (Personal)/CorruptionIncome/EJFinalSubmission/data&program/intermediary data/"
global dir_data_analysis "/Users/Jie/Dropbox (Personal)/CorruptionIncome/EJFinalSubmission/data&program/analysis data/"

* NOTE(review): with no version number this only displays the current
* interpreter version; it does not pin one. Confirm the intended release.
version 

** LOAD THE CLEANED PCI REPEATED CROSS-SECTION
use "${dir_data_coded}/PCI_repeated_cross_CLEAN.dta", clear

** FIRM TYPE
rename h5_1 equitizedlocalsoe
rename h5_2 equitizedcentralsoe
rename h5_9 govholdshare

* formerSOE: 1 if either equitized-SOE flag is 1, 0 only if both flags are 0,
* missing in every other case.
generate formerSOE = cond(equitizedlocalsoe==1 | equitizedcentralsoe==1, 1, ///
	cond(equitizedlocalsoe==0 & equitizedcentralsoe==0, 0, .))

* HARMONIZE YEARS FOR PCI VERSUS GSO (shift the PCI year back by one)
replace year = year - 1
* The comparison with "" requires the code to be a string variable already.
drop if isic_rev4_2digit == ""

* GET BROAD INDUSTRY CODE
* Attach the crosswalk and keep matched observations only.
sort isic_rev4_2digit
tostring isic_rev4_2digit, replace
merge m:1 isic_rev4_2digit using "${dir_data_original}/Crosswalks/isic_alphabet_crosswalk.dta", ///
	keep(match) nogenerate

drop if broad == ""

** GENERATE GROUP VARIABLES
* r = pci_id, j = numeric encoding of broad, t = year
generate r = pci_id
encode broad, generate(j)
generate t = year

* One group identifier per combination: rjt, rj, rt, jt
foreach cells in "r j t" "r j" "r t" "j t" {
	local gname = subinstr("`cells'", " ", "", .)
	egen `gname' = group(`cells')
}

** CODE BRIBE VARIABLE
tab g4, mi

* Map each g4 category (1-7) to a numeric value; any other g4 value
* (including missing) leaves bribe_pctrev missing.
generate bribe_pctrev = .
replace bribe_pctrev = 35  if g4 == 1
replace bribe_pctrev = 25  if g4 == 2
replace bribe_pctrev = 15  if g4 == 3
replace bribe_pctrev = 6   if g4 == 4
replace bribe_pctrev = 1.5 if g4 == 5
replace bribe_pctrev = 0.5 if g4 == 6
replace bribe_pctrev = 0   if g4 == 7

* Dichotomize g3: categories 1-2 become 1, categories 3-4 become 0
recode g3 (1 2 = 1) (3 4 = 0), generate(bribe_common)

** CODE EMPLOYMENT VARIABLE
tab a9_3, mi
rename a9_3 employ
tab employ

* employ_n: numeric value assigned to each employ category 1-8, taken in
* order from the list below; other categories stay missing.
local empvalues 3 6.5 19.3 91.5 240.8 377 683 2082.6
generate employ_n = .
forvalues k = 1/8 {
	local v : word `k' of `empvalues'
	replace employ_n = `v' if employ == `k'
}

rename a9_2 employ_lag
rename a9_1 employ_establish


** SIZE OF BUSINESS PREMISES
* -9 in b1 is treated as missing
recode b1 (-9=.)
* pieces_d flags years before 2008; all other observations, including any
* with a missing year, get 0.
generate pieces_d = (year < 2008)
* Log premises size, set to 0 wherever pieces_d==1
* (change in coding, piece to hectare)
generate lnprem = cond(pieces_d == 1, 0, ln(b1))


**LURC AND LAND OWNERSHIP RIGHTS
* Collapse the extended missing codes .a/.b in b4_1 and b4_1_1 to system
* missing, then set b4_1 to 1 whenever b4_1_1 is non-missing but b4_1 is
* missing or 0.
* NOTE(review): b4_1_1 is compared with year values below, so it is presumably
* the year the LURC was obtained -- confirm against the questionnaire.
replace b4_1=. if b4_1==.b|b4_1==.a						 //correct coding errors
replace b4_1_1=. if b4_1_1==.b|b4_1_1==.a
replace b4_1=1 if b4_1_1!=. & (b4_1==.|b4_1==0)	

g lurc=b4_1

* Reconcile the two indicators: lurc==1 forces ownland to 1, and ownland==0
* forces lurc to 0.
replace ownland=1 if lurc==1	//coding errors mostly
replace lurc=0 if ownland==0

* Means within industry group j (j is the encoded broad industry).
* ownlandnlc = 1 for ownland==1 with lurc==0; 0 for ownland==0 or for
* ownland==1 with lurc==1; missing otherwise.
bys j: egen lurcj=mean(lurc)
bys j: egen ownlandj=mean(ownland)
g ownlandnlc =1 if ownland==1 & lurc==0
replace ownlandnlc=0 if ownland==0|(ownland==1 & lurc==1)
bys j: egen ownlandnlcj=mean(ownlandnlc)

***FIRM-YEAR LURC AND LURCLASTYR
* lurc_o keeps the reconciled indicator; lurc and lurclastyr are adjusted
* copies of it. year was already shifted back by one earlier in this file, so
* nextyr is the original PCI year and lastyr is two years before it.
* lurc: set to 0 when b4_1_1 equals nextyr, and to missing when lurc_o==1 but
* b4_1_1 is missing.
g nextyr=year+1
ren lurc lurc_o
g lurc=lurc_o
replace lurc=0 if b4_1_1==nextyr & lurc_o==1 		//correct for one year lag in PCI
replace lurc=. if b4_1_1==. & lurc_o==1
* lurclastyr: set to 0 when b4_1_1 is later than lastyr, and to missing when
* lurc_o==1 but b4_1_1 is missing.
g lastyr=year-1
g lurclastyr=lurc_o
replace lurclastyr=0 if (b4_1_1>lastyr & b4_1_1!=.) & lurc_o==1
replace lurclastyr=. if b4_1_1==. & lurc_o==1

** AGE OF FIRM
* Years open = (year + 1) minus a1, plus its mean within industry group j
generate yrsopen = (year + 1) - a1
bysort j: egen yrsopenj = mean(yrsopen)

*** PROFITS
* Copy a11_3, mapping the extended missing code .b to system missing
generate profit = cond(a11_3 == .b, ., a11_3)

**NUMBER OF DOCUMENTS
* numdocs: c3 top-coded at 15.
* sharedocs: c3 as a share of the 95th percentile of c3 within industry group
* j, capped at 1.
* Missing checks use !mi() rather than "!= ." because extended missing values
* (.a, .b, ...) are also greater than any number and are not equal to "."; with
* "!= ." an extended-missing c3 would have been top-coded to 15.
g numdocs=c3
replace numdocs = 15 if c3 > 15 & !mi(c3) //topcode it to remove outliers
bys j: egen numdocs95=pctile(c3) if !mi(c3),p(95)
g sharedocs=c3/numdocs95
replace sharedocs=1 if sharedocs>1 & !mi(sharedocs)

** HOUSEHOLD ENTERPRISE
rename h5_3 formerhhfirm

*** OWNER RELATIONSHIPS
rename h5_4 ownergov
rename h5_6 ownersoe

*** INDUSTRIAL ZONE
tab iz, m

*** PROPENSITY SCORE
* Every model below uses the same right-hand-side variables.
local pscovars lnprem pieces_d yrsopen sharedocs formerhhfirm ownergov ownersoe formerSOE govholdshare

* Logit for each land-rights outcome; the prediction is stored as p<outcome>
foreach outcome in lurc lurclastyr ownland {
	logit `outcome' `pscovars'
	predict p`outcome'
}

* operations: -1 is treated as missing before the ordered logit
replace operations = . if operations == -1
ologit operations `pscovars'
predict poperations

logit opdich `pscovars'
predict popdich

***KEEP VARS
* Keep only the variables needed downstream, then write the coded data set.
* The semicolon delimiter is used only for the multi-line keep list and is
* switched back to carriage return straight afterwards, so that the save
* command is a complete, terminated statement and is actually executed.
#delimit ; 

keep id year pci_id prov isic_rev4_2digit isic_rev4_4digit broad bribe_pctrev bribe_common employ employ_lag employ_establish
 lnprem pieces_d lurc lurc_o lurclastyr lurcj ownland ownlandj ownlandnlc ownlandnlcj yrsopen yrsopenj numdocs sharedocs
 formerhhfirm mobile ownergov ownersoe formerSOE govholdshare iz profit region employ_n operations opdich choose_other
 plurc plurclastyr pownland poperations popdich;

#delimit cr

save "${dir_data_coded}/PCI_coded.dta",replace

